# Dickstein, Ho, and Mark (2023)
# This code creates the final household-level sample dataset by combining the
# household-level data from OHA's APAC dataset with the number of uninsured
# households in each group of households defined by their characteristics.

# * # * # * # * # * # * #
# PRELIMINARIES         #
# * # * # * # * # * # * #

# Move into the shared library directory (the path is relative to the working
# directory at launch). Every relative path used below -- the fread() inputs
# and the final write.csv() output -- is resolved against this directory.
setwd("../library")
# Run the project's shared setup script.
# NOTE(review): presumably this attaches data.table and defines helpers such
# as mode1() that are used later in this script -- confirm.
source("PreliminariesCode.R")

# * # * # * # * # * # * #
# LOADING DATA          #
# * # * # * # * # * # * #

## Loading Individual Market Subscriber Data
# Subscriber observations, read from the current working directory.
SubDat <- fread("grouped_subs.csv")
print(paste0("There are ", nrow(SubDat), " subscriber observations when loaded."))
# Rename the plan identifier to the name used by the rest of the script.
# setnames() renames by reference (no copy of the table); skip_absent = TRUE
# keeps the rename a silent no-op if the source column is missing, which is
# how the previous name-matching assignment behaved.
setnames(SubDat, "constructed_plan_year", "chosen_plan_id", skip_absent = TRUE)

## Loading Uninsured Data
UninDat <- fread("3_uninsured_rates.csv")
# The uninsured count lives in UninDat (the table just loaded); it was
# previously summed from SubDat, which reported the wrong table's total.
print(paste0("There are ", sum(UninDat$pop_unins_post, na.rm = TRUE), " uninsured households when loaded."))

## Large-group identifier (group_id2): one integer per distinct combination of
## the coarse household characteristics listed in group_vars2.
group_vars2 <- c("year", "best_guess_ra", "age_bins", "married", "withkids")
SubDat[, group_id2 := .GRP, by = group_vars2]

## Collapse the subscriber data to one row per small group (group_id) and one
## row per large group (group_id2). The married / withkids flags are cast to
## numeric before being used as merge keys below.
GrpDat <- unique(
  SubDat[, .(
    year,
    age_bins,
    best_guess_ra,
    fpl_bins,
    acg_quartiles,
    married = as.numeric(married),
    withkids = as.numeric(withkids),
    group_id
  )]
)
GrpDat2 <- unique(
  SubDat[, .(
    year,
    age_bins,
    best_guess_ra,
    married = as.numeric(married),
    withkids = as.numeric(withkids),
    group_id2
  )]
)

## Attach both group identifiers to the uninsured cells. Left joins: uninsured
## cells with no insured counterpart are kept and get NA for the identifier.
# Key columns, in matching order: uninsured-table name <-> subscriber-table name.
unin_keys_small <- c("year", "age", "ra", "income", "acg_quartiles", "is_married", "has_children")
grp_keys_small <- c("year", "age_bins", "best_guess_ra", "fpl_bins", "acg_quartiles", "married", "withkids")
unin_keys_large <- c("year", "age", "ra", "is_married", "has_children")
grp_keys_large <- c("year", "age_bins", "best_guess_ra", "married", "withkids")

UninDat <- merge(
  UninDat, GrpDat,
  by.x = unin_keys_small,
  by.y = grp_keys_small,
  all.x = TRUE
)
UninDat <- merge(
  UninDat, GrpDat2,
  by.x = unin_keys_large,
  by.y = grp_keys_large,
  all.x = TRUE
)

## Consistency check and feedback.
# A cell that matched a small group must also have matched a large group.
n_inconsistent <- UninDat[!is.na(group_id) & is.na(group_id2), .N]
if (n_inconsistent > 0) {
  stop("Large groups must be sets of small groups.")
}

# Household totals: overall, without a small-group match, without a large-group match.
unins_total <- sum(UninDat$pop_unins_post)
unins_no_small <- UninDat[is.na(group_id), sum(pop_unins_post)]
unins_no_large <- UninDat[is.na(group_id2), sum(pop_unins_post)]
print(paste0("There are ", unins_total, " uninsured households to start."))
print(paste0("There are ", unins_no_small, " uninsured households who fit in a small bin with no insured households in it."))
print(paste0("There are ", unins_no_large, " uninsured households who fit in a large bin with no insured households in it. These are removed."))

# Drop uninsured cells that have no large group to draw from.
UninDat <- UninDat[!is.na(group_id2)]

# Split the remainder in two:
#  - cells with a small-group match are kept as they are;
#  - cells without one are summed up to the large-group level.
large_grp_keys <- c("year", "age", "ra", "is_married", "has_children", "group_id2")
UninDat_smallgrp <- UninDat[!is.na(group_id)]
UninDat_largegrp <- UninDat[
  is.na(group_id),
  .(pop_unins_post = sum(pop_unins_post)),
  by = large_grp_keys
]

# * # * # * # * # * # * # * # * # * #
# CLEANING For Final Analysis       #
# * # * # * # * # * # * # * # * # * #

# Columns kept for the final analysis, organised by their role. Entries whose
# name ends in ".backup" hold the underlying variables that the main entries
# are built from.
var_list <- list(
  # IDs: subscriberid, year
  idvars = c("subscriberid", "year"),

  # Choice variables: the constructed plan ID and the variables that go in it
  choice_vars = c("chosen_plan_id"),
  choice_vars.backup = c(
    "markettype", "exog_exchange", "best_guess_ra",
    "payer_id", "best_guess_metal", "mnc_plantype"
  ),

  # Non-discretionary spending variables: subscriber age, number of kids,
  # number of spouses, mean ACG, max ACG
  ndspendingvars = c(
    "age_bins_label", "withkids", "married",
    "acg_quartiles_label", "acg_max_quartiles_label"
  ),
  ndspendingvars.backup = c(
    "age", "ndeps", "nspouse",
    "sum_concurrent_risk", "max_concurrent_risk"
  ),

  # Discretionary spending (moral hazard) variables: none at present, so
  # var_list has no moralhazardvars / moralhazardvars.backup entries.

  # Risk aversion variables: income
  riskavervars = c("fpl_bins"),
  riskavervars.backup = c("best_guess_incomeoverFPL"),

  # Cost variables: total cost, total OOP cost
  costvars = c(
    "totpaid", "totcopay", "totcoins", "totdeduct", "numzeroclaims",
    "nummonths_observed", "nummonths_span",
    "totpaidmonth_observed", "totpaidmonth_span"
  )
)

# Reduce the subscriber table to the two group identifiers plus every listed
# column, in that order.
keep_cols <- c("group_id", "group_id2", unlist(var_list))
SubDat <- SubDat[, keep_cols, with = FALSE]

# * # * # * # * # * # * # * # * # * # * # * # * # * # * #
# TAKING DRAWS OF CHARACTERISTICS FOR UNINSURED         #
# * # * # * # * # * # * # * # * # * # * # * # * # * # * #

# In SubDat, number the subscribers 1..group size within each small group
# (gid) and within each large group (gid2). These are the ids drawn for the
# uninsured households further down.
SubDat[, gid := seq_len(.N), by = "group_id"]
SubDat[, gid2 := seq_len(.N), by = "group_id2"]

# Expand BOTH uninsured tables (small-group and large-group) to household
# level: row i is repeated round(pop_unins_post[i]) times.
# seq_len(.N) is used instead of 1:.N so that an empty table (for example, when
# every uninsured cell matched a small group) expands to an empty table rather
# than failing inside rep(), since 1:0 evaluates to c(1, 0).
UninDat_sg_Expanded <- UninDat_smallgrp[rep(seq_len(.N), round(pop_unins_post, 0))]
UninDat_lg_Expanded <- UninDat_largegrp[rep(seq_len(.N), round(pop_unins_post, 0))]

# Quality test: the number of expanded rows in each group should differ from
# the (unrounded) household count only by rounding error.
print("Should be close to zero:")
UninDat_sg_Expanded[, n := .N, by = "group_id"]
UninDat_lg_Expanded[, n := .N, by = "group_id2"]
print(mean(UninDat_sg_Expanded$n - UninDat_sg_Expanded$pop_unins_post, na.rm = TRUE))
print(mean(UninDat_lg_Expanded$n - UninDat_lg_Expanded$pop_unins_post, na.rm = TRUE))

# Count the insured subscribers in every small group and every large group.
InGrpSize <- SubDat[, .(insured_grpsize = .N), by = .(group_id)]
InGrpSize2 <- SubDat[, .(insured_grpsize2 = .N), by = .(group_id2)]

# Attach the insured group size to every expanded uninsured household
# (left joins, so an unmatched group shows up as NA and is caught below).
UninDat_sg_Expanded <- merge(
  UninDat_sg_Expanded, InGrpSize,
  by = "group_id", all.x = TRUE
)
UninDat_lg_Expanded <- merge(
  UninDat_lg_Expanded, InGrpSize2,
  by = "group_id2", all.x = TRUE
)

# Every uninsured household needs an insured group to draw from.
small_unmatched <- anyNA(UninDat_sg_Expanded$insured_grpsize)
large_unmatched <- anyNA(UninDat_lg_Expanded$insured_grpsize2)
if (small_unmatched || large_unmatched) {
  stop("Some uninsurance groups have no corresponding insured groups")
}

# For each uninsured household, draw (with replacement) the within-group id of
# an insured subscriber from the same group. The group size is constant within
# a group; mode1() collapses it to the single value used as the upper bound.
UninDat_sg_Expanded[
  , gid := sample(1:mode1(insured_grpsize), .N, replace = TRUE),
  by = "group_id"
]
UninDat_lg_Expanded[
  , gid2 := sample(1:mode1(insured_grpsize2), .N, replace = TRUE),
  by = "group_id2"
]

# Give each uninsured household the characteristics of the insured subscriber
# it drew: match on the group identifier plus the drawn within-group id.
# Where a non-key column exists on both sides, the uninsured-side copy gets the
# ".bin_unin" suffix and the subscriber-side copy keeps its plain name, so the
# plain-named columns hold the drawn subscriber's values.
UninDat_sg_Expanded <- merge(UninDat_sg_Expanded,
                 SubDat,
                 all.x = TRUE,
                 by = c("group_id", "gid"),
                 suffixes = c(".bin_unin", ""))

UninDat_lg_Expanded <- merge(UninDat_lg_Expanded,
                 SubDat,
                 all.x = TRUE,
                 by = c("group_id2", "gid2"),
                 suffixes = c(".bin_unin", ""))

# Stack the two files; fill = TRUE pads columns present in only one of them
# with NA.
UninDat <- rbind(UninDat_sg_Expanded,
  UninDat_lg_Expanded,
  fill = TRUE)
# Leading space added before "uninsured": the count and the word were
# previously printed run together.
print(paste0("There are ", nrow(UninDat), " uninsured households at the end."))

# The drawn subscriber's plan-specific fields do not apply to an uninsured
# household: blank them out and record the outside option as the chosen plan.
UninDat[, `:=`(
  payer_id = NA,
  best_guess_metal = NA,
  mnc_plantype = NA,
  chosen_plan_id = "Outside_Option"
)]

# Give the uninsured table the same columns, in the same order, as SubDat, and
# drop the helper draw ids from SubDat so the two tables line up.
final_cols <- c("group_id", "group_id2", unlist(var_list))
UninDat <- UninDat[, final_cols, with = FALSE]
SubDat[, c("gid", "gid2") := NULL]

# * # * # * # * # * # * # * # * # * #
# SAVING All Subscribers Data       #
# * # * # * # * # * # * # * # * # * #
# Insured subscribers first, then the simulated uninsured households.
TotDat <- rbind(SubDat, UninDat)
write.csv(TotDat, file = "finalsubsdata.csv", row.names = FALSE)
